{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "pos = []\n",
    "with open(\"./hotel/pos.txt\",\"r\",encoding='utf-8') as f:\n",
    "    for line in  f.readlines():\n",
    "        pos.append(line.strip())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "neg = []\n",
    "with open(\"./hotel/neg.txt\",'r',encoding='utf-8') as f:\n",
    "    for line in f.readlines():\n",
    "        neg.append(line.strip())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "pos_with_tag = [(li,'pos') for li in pos]\n",
    "neg_with_tag = [(li,'neg') for li in neg]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_datas  =[] \n",
    "all_datas.extend(neg_with_tag)\n",
    "all_datas.extend(pos_with_tag)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(all_datas,columns=[\"sentence\",\"label\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.utils import shuffle\n",
    "df = shuffle(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_pos = pd.DataFrame(pos_with_tag,columns=['sentence','label'])\n",
    "df_neg = pd.DataFrame(neg_with_tag,columns=['sentence','label'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_pos_shuffle = shuffle(df_pos)\n",
    "df_neg_shuffle = shuffle(df_neg)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_pos_test = df_pos_shuffle[:1000]\n",
    "df_pos_train = df_pos_shuffle[1000:]\n",
    "df_neg_test = df_neg_shuffle[:1000]\n",
    "df_neg_train = df_neg_shuffle[1000:]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "sentence    2000\n",
       "label       2000\n",
       "dtype: int64"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_test = shuffle(pd.concat([df_pos_test,df_neg_test],axis=0))\n",
    "df_test.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sentence</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>2035</th>\n",
       "      <td>房间还行，交通也方便，当然火车站周边略显嘈杂，总体还行。</td>\n",
       "      <td>pos</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3050</th>\n",
       "      <td>因为看了之前的评论才定的这家酒店，没想到换来了一个惊喜，房间的大窗户对着小花园，没想到能住在...</td>\n",
       "      <td>pos</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3007</th>\n",
       "      <td>一个不错的酒店，如果不抽烟的话就住4层的无烟客房，适合一家人度假住，我在那里住了4个晚上，当...</td>\n",
       "      <td>pos</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4461</th>\n",
       "      <td>房间太小，</td>\n",
       "      <td>neg</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>288</th>\n",
       "      <td>总体可以！就是没有窗户！</td>\n",
       "      <td>neg</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               sentence label\n",
       "2035                       房间还行，交通也方便，当然火车站周边略显嘈杂，总体还行。   pos\n",
       "3050  因为看了之前的评论才定的这家酒店，没想到换来了一个惊喜，房间的大窗户对着小花园，没想到能住在...   pos\n",
       "3007  一个不错的酒店，如果不抽烟的话就住4层的无烟客房，适合一家人度假住，我在那里住了4个晚上，当...   pos\n",
       "4461                                              房间太小，   neg\n",
       "288                                        总体可以！就是没有窗户！   neg"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train =shuffle(pd.concat([df_pos_train,df_neg_train],axis=0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "sentence    8000\n",
       "label       8000\n",
       "dtype: int64"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train.count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>sentence</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1093</th>\n",
       "      <td>酒店服务非常好，比金茂君悦和浦东香格里拉的新楼都好，但价格却比他们便宜一点。服务员都训练有素...</td>\n",
       "      <td>pos</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1504</th>\n",
       "      <td>酒店的装修确实还不错,就是前面在修地铁,影响很大,看见这个酒店,坐车最起码还要二十分钟才能到...</td>\n",
       "      <td>pos</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>481</th>\n",
       "      <td>这次住的丽晶，实在是令人气愤。酒店浴室没有防滑垫，铺了个毛巾在地上冲凉。怎奈毛巾太小，还是摔...</td>\n",
       "      <td>neg</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4910</th>\n",
       "      <td>在浦东，价位不算太高的宾馆了吧！</td>\n",
       "      <td>neg</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2873</th>\n",
       "      <td>2/17除夕夜订房预订240标间感觉不错!所以2/25再次入住提升为豪标270元,没想到.....</td>\n",
       "      <td>neg</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                               sentence label\n",
       "1093  酒店服务非常好，比金茂君悦和浦东香格里拉的新楼都好，但价格却比他们便宜一点。服务员都训练有素...   pos\n",
       "1504  酒店的装修确实还不错,就是前面在修地铁,影响很大,看见这个酒店,坐车最起码还要二十分钟才能到...   pos\n",
       "481   这次住的丽晶，实在是令人气愤。酒店浴室没有防滑垫，铺了个毛巾在地上冲凉。怎奈毛巾太小，还是摔...   neg\n",
       "4910                                   在浦东，价位不算太高的宾馆了吧！   neg\n",
       "2873  2/17除夕夜订房预订240标间感觉不错!所以2/25再次入住提升为豪标270元,没想到.....   neg"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train.to_csv(\"./datas/train.csv\",index=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_test.to_csv(\"./datas/test.csv\",index=None)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
